pacman::p_load(sf, spdep, GWmodel, SpatialML,
tmap, rsample, Metrics, tidyverse,
knitr, kableExtra)In Class Exercise 8 - Geographically Weighted Predictive Modelling
Part 1 : Load Data and Packages
mdata <- read_rds("data/mdata.rds")set.seed(1234)
# Jitter the data
for (i in 1:nrow(mdata)) {
coords <- st_coordinates(mdata[i, ])
jittered_coords <- coords + runif(n = 1, min = -0.1, max = 0.1)
mdata[i,]$geometry <- st_sfc(st_point(c(jittered_coords), dim = "XY"), crs = 3414)
}
resale_split <- initial_split(mdata,
prop = 6.5/10,)
train_data <- training(resale_split)
test_data <- testing(resale_split)mdata_nogeo <- mdata %>% st_drop_geometry()
ggstatsplot::ggcorrmat(mdata_nogeo[, 2:17])
Part 2 : Linear Regression
price_mlr <- lm(resale_price ~ floor_area_sqm +
storey_order + remaining_lease_mths +
PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
PROX_MRT + PROX_PARK + PROX_MALL +
PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
WITHIN_1KM_PRISCH,
data=train_data)
# Prnt out a nice Report
olsrr::ols_regress(price_mlr) Model Summary
--------------------------------------------------------------------------
R 0.864 RMSE 60891.465
R-Squared 0.746 MSE 3713159666.904
Adj. R-Squared 0.746 Coef. Var 14.068
Pred R-Squared 0.746 AIC 257079.714
MAE 47027.870 SBC 257195.606
--------------------------------------------------------------------------
RMSE: Root Mean Square Error
MSE: Mean Square Error
MAE: Mean Absolute Error
AIC: Akaike Information Criteria
SBC: Schwarz Bayesian Criteria
ANOVA
--------------------------------------------------------------------------------
Sum of
Squares DF Mean Square F Sig.
--------------------------------------------------------------------------------
Regression 1.128008e+14 14 8.057201e+12 2169.904 0.0000
Residual 3.831981e+13 10320 3713159666.904
Total 1.511206e+14 10334
--------------------------------------------------------------------------------
Parameter Estimates
------------------------------------------------------------------------------------------------------------------
model Beta Std. Error Std. Beta t Sig lower upper
------------------------------------------------------------------------------------------------------------------
(Intercept) 116657.901 10562.315 11.045 0.000 95953.716 137362.087
floor_area_sqm 2755.393 89.663 0.162 30.730 0.000 2579.635 2931.150
storey_order 14158.799 333.897 0.230 42.405 0.000 13504.297 14813.301
remaining_lease_mths 346.494 4.556 0.442 76.058 0.000 337.564 355.424
PROX_CBD -17205.198 199.202 -0.593 -86.371 0.000 -17595.672 -16814.724
PROX_ELDERLYCARE -13344.152 979.799 -0.074 -13.619 0.000 -15264.749 -11423.555
PROX_HAWKER -19225.464 1267.105 -0.082 -15.173 0.000 -21709.235 -16741.693
PROX_MRT -34996.049 1724.248 -0.112 -20.296 0.000 -38375.908 -31616.189
PROX_PARK -5931.813 1455.400 -0.022 -4.076 0.000 -8784.680 -3078.947
PROX_MALL -17450.175 1999.149 -0.052 -8.729 0.000 -21368.896 -13531.455
PROX_SUPERMARKET -26563.411 4115.914 -0.034 -6.454 0.000 -34631.401 -18495.422
WITHIN_350M_KINDERGARTEN 7846.864 625.644 0.066 12.542 0.000 6620.482 9073.247
WITHIN_350M_CHILDCARE -4357.283 351.829 -0.071 -12.385 0.000 -5046.935 -3667.630
WITHIN_350M_BUS 886.959 221.590 0.021 4.003 0.000 452.600 1321.317
WITHIN_1KM_PRISCH -8731.084 483.917 -0.110 -18.043 0.000 -9679.655 -7782.512
------------------------------------------------------------------------------------------------------------------
# Also use package easystats for modelling visualization and reporting
vif <- performance::check_collinearity(price_mlr)
kable(vif, caption = "Variance Inflation Factor (VIF) Results") %>% kable_styling(font_size = 18)| Term | VIF | VIF_CI_low | VIF_CI_high | SE_factor | Tolerance | Tolerance_CI_low | Tolerance_CI_high |
|---|---|---|---|---|---|---|---|
| floor_area_sqm | 1.129928 | 1.107808 | 1.156586 | 1.062981 | 0.8850121 | 0.8646133 | 0.9026833 |
| storey_order | 1.197418 | 1.172317 | 1.226175 | 1.094266 | 0.8351303 | 0.8155443 | 0.8530114 |
| remaining_lease_mths | 1.375186 | 1.342965 | 1.410433 | 1.172683 | 0.7271745 | 0.7090019 | 0.7446210 |
| PROX_CBD | 1.915262 | 1.862374 | 1.971395 | 1.383930 | 0.5221217 | 0.5072551 | 0.5369492 |
| PROX_ELDERLYCARE | 1.195039 | 1.170038 | 1.223715 | 1.093178 | 0.8367930 | 0.8171838 | 0.8546730 |
| PROX_HAWKER | 1.194439 | 1.169464 | 1.223095 | 1.092904 | 0.8372129 | 0.8175978 | 0.8550926 |
| PROX_MRT | 1.243143 | 1.216156 | 1.273500 | 1.114963 | 0.8044123 | 0.7852373 | 0.8222629 |
| PROX_PARK | 1.202157 | 1.176857 | 1.231075 | 1.096429 | 0.8318384 | 0.8122981 | 0.8497208 |
| PROX_MALL | 1.422083 | 1.388040 | 1.459112 | 1.192511 | 0.7031939 | 0.6853483 | 0.7204403 |
| PROX_SUPERMARKET | 1.160541 | 1.137026 | 1.188092 | 1.077284 | 0.8616670 | 0.8416858 | 0.8794875 |
| WITHIN_350M_KINDERGARTEN | 1.121277 | 1.099572 | 1.147712 | 1.058903 | 0.8918406 | 0.8712983 | 0.9094449 |
| WITHIN_350M_CHILDCARE | 1.326777 | 1.296452 | 1.360204 | 1.151858 | 0.7537061 | 0.7351839 | 0.7713358 |
| WITHIN_350M_BUS | 1.151343 | 1.128237 | 1.178612 | 1.073006 | 0.8685509 | 0.8484557 | 0.8863383 |
| WITHIN_1KM_PRISCH | 1.526322 | 1.488261 | 1.567349 | 1.235444 | 0.6551699 | 0.6380201 | 0.6719251 |
# Anything > 5 would mean some collinear, more than 10 shoud just exclude
plot(vif) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Variable `Component` is not in your data frame :/

Part 3 : Predictive Modeling
bw_adaptive <- bw.gwr(resale_price ~ floor_area_sqm +
storey_order + remaining_lease_mths +
PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
PROX_MRT + PROX_PARK + PROX_MALL +
PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
WITHIN_350M_CHILDCARE +
WITHIN_1KM_PRISCH,
data=train_data,
approach="CV",
kernel="gaussian",
adaptive=TRUE,
longlat=FALSE)Take a cup of tea and have a break, it will take a few minutes.
-----A kind suggestion from GWmodel development group
Adaptive bandwidth: 6395 CV score: 3.530305e+13
Adaptive bandwidth: 3960 CV score: 3.255536e+13
Adaptive bandwidth: 2455 CV score: 2.880938e+13
Adaptive bandwidth: 1524 CV score: 2.513316e+13
Adaptive bandwidth: 950 CV score: 1.933083e+13
Adaptive bandwidth: 593 CV score: 1.558231e+13
Adaptive bandwidth: 375 CV score: 1.280445e+13
Adaptive bandwidth: 237 CV score: 1.07939e+13
Adaptive bandwidth: 155 CV score: 9.258612e+12
Adaptive bandwidth: 101 CV score: 8.158468e+12
Adaptive bandwidth: 71 CV score: 7.384932e+12
Adaptive bandwidth: 49 CV score: 6.720673e+12
Adaptive bandwidth: 38 CV score: 6.442666e+12
Adaptive bandwidth: 29 CV score: Inf
Adaptive bandwidth: 41 CV score: 6.527444e+12
Adaptive bandwidth: 33 CV score: Inf
Adaptive bandwidth: 38 CV score: 6.442666e+12
gwr_adaptive <- gwr.basic(formula = resale_price ~
floor_area_sqm + storey_order +
remaining_lease_mths + PROX_CBD +
PROX_ELDERLYCARE + PROX_HAWKER +
PROX_MRT + PROX_PARK + PROX_MALL +
PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
WITHIN_350M_CHILDCARE + WITHIN_1KM_PRISCH,
data=train_data,
bw=bw_adaptive,
kernel = 'gaussian',
adaptive=TRUE,
longlat = FALSE)gwr_bw_test_adaptive <- bw.gwr(resale_price ~ floor_area_sqm +
storey_order + remaining_lease_mths +
PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
PROX_MRT + PROX_PARK + PROX_MALL +
PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
WITHIN_350M_CHILDCARE +
WITHIN_1KM_PRISCH,
data=test_data,
approach="CV",
kernel="gaussian",
adaptive=TRUE,
longlat=FALSE)Take a cup of tea and have a break, it will take a few minutes.
-----A kind suggestion from GWmodel development group
Adaptive bandwidth: 3447 CV score: 1.990467e+13
Adaptive bandwidth: 2138 CV score: 1.82957e+13
Adaptive bandwidth: 1328 CV score: 1.617623e+13
Adaptive bandwidth: 828 CV score: 1.417383e+13
Adaptive bandwidth: 518 CV score: 1.121439e+13
Adaptive bandwidth: 327 CV score: 9.053746e+12
Adaptive bandwidth: 208 CV score: 7.571744e+12
Adaptive bandwidth: 135 CV score: 6.593324e+12
Adaptive bandwidth: 89 CV score: 5.817039e+12
Adaptive bandwidth: 62 CV score: 5.312371e+12
Adaptive bandwidth: 43 CV score: 4.845013e+12
Adaptive bandwidth: 34 CV score: 4.64567e+12
Adaptive bandwidth: 25 CV score: 4.542503e+12
Adaptive bandwidth: 23 CV score: 4.571456e+12
Adaptive bandwidth: 30 CV score: 4.561981e+12
Adaptive bandwidth: 25 CV score: 4.542503e+12
# Im still getting a singular matrix error despite testing and debugging
#gwr_pred <- gwr.predict(formula = resale_price ~
# floor_area_sqm + storey_order +
# remaining_lease_mths + PROX_CBD +
# PROX_ELDERLYCARE + PROX_HAWKER +
# PROX_MRT + PROX_PARK + PROX_MALL +
# PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
# WITHIN_350M_CHILDCARE +
# WITHIN_1KM_PRISCH,
# data= train_data,
# predictdata = test_data,
# bw=bw_adaptive,
# kernel = 'gaussian',
# adaptive= FALSE,
# longlat = FALSE)Part 4 : Spatial-ML Methods
Get Coordinates Train-Test Split
coords <- st_coordinates(mdata)
coords_train <- st_coordinates(train_data)
coords_test <- st_coordinates(test_data)Remove Geometry
train_data_nogeom <- train_data %>% st_drop_geometry()Do Random Forests
set.seed(1234)
rf <- ranger(resale_price ~ floor_area_sqm + storey_order +
remaining_lease_mths + PROX_CBD + PROX_ELDERLYCARE +
PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL +
PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
WITHIN_1KM_PRISCH,
data=train_data_nogeom)
rfRanger result
Call:
ranger(resale_price ~ floor_area_sqm + storey_order + remaining_lease_mths + PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN + WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + WITHIN_1KM_PRISCH, data = train_data_nogeom)
Type: Regression
Number of trees: 500
Sample size: 10335
Number of independent variables: 14
Mtry: 3
Target node size: 5
Variable importance mode: none
Splitrule: variance
OOB prediction error (MSE): 708877837
R squared (OOB): 0.9515252
set.seed(1234)
gwRF_adaptive <- grf(formula = resale_price ~ floor_area_sqm + storey_order +
remaining_lease_mths + PROX_CBD + PROX_ELDERLYCARE +
PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL +
PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
WITHIN_1KM_PRISCH,
dframe=train_data_nogeom,
bw=55,
kernel="adaptive",
coords=coords_train)
Number of Observations: 10335
Number of Independent Variables: 14
Kernel: Adaptive
Neightbours: 55
--------------- Global ML Model Summary ---------------
Ranger result
Call:
ranger(resale_price ~ floor_area_sqm + storey_order + remaining_lease_mths + PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN + WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + WITHIN_1KM_PRISCH, data = train_data_nogeom, num.trees = 500, mtry = 4, importance = "impurity", num.threads = NULL)
Type: Regression
Number of trees: 500
Sample size: 10335
Number of independent variables: 14
Mtry: 4
Target node size: 5
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 672555845
R squared (OOB): 0.954009
Importance:
floor_area_sqm storey_order remaining_lease_mths
6.957558e+12 1.445835e+13 2.942620e+13
PROX_CBD PROX_ELDERLYCARE PROX_HAWKER
5.480924e+13 7.166729e+12 5.683696e+12
PROX_MRT PROX_PARK PROX_MALL
8.141868e+12 5.006469e+12 4.205909e+12
PROX_SUPERMARKET WITHIN_350M_KINDERGARTEN WITHIN_350M_CHILDCARE
2.769281e+12 9.319988e+11 1.693840e+12
WITHIN_350M_BUS WITHIN_1KM_PRISCH
1.604206e+12 7.118579e+12
Mean Square Error (Not OOB): 165916724.251
R-squared (Not OOB) %: 98.865
AIC (Not OOB): 195640.509
AICc (Not OOB): 195640.556
--------------- Local Model Summary ---------------
Residuals OOB:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-216953.8 -14428.6 606.2 218.1 14941.9 227138.0
Residuals Predicted (Not OOB):
Min. 1st Qu. Median Mean 3rd Qu. Max.
-82542.08 -3533.79 78.63 29.87 3746.36 63248.20
Local Variable Importance:
Min Max Mean StD
floor_area_sqm 0 347702688833 16952096525 36996489828
storey_order 242142687 289172735788 16426959259 24298165408
remaining_lease_mths 440657099 579272913133 32619674467 65603975965
PROX_CBD 112594042 340583744430 11604192564 25726882688
PROX_ELDERLYCARE 101476719 332688632806 10261577749 22659387882
PROX_HAWKER 103893583 299569897721 9905264951 20939125860
PROX_MRT 98947793 269150464299 10276749397 23591023441
PROX_PARK 107229395 347121976193 9559739547 21906746329
PROX_MALL 105250860 437631810372 11595469504 28316050893
PROX_SUPERMARKET 112316285 342234698506 9831051060 24793192793
WITHIN_350M_KINDERGARTEN 0 219947190500 2793653229 13546728631
WITHIN_350M_CHILDCARE 0 210593671219 5081883890 17359704011
WITHIN_350M_BUS 0 215473141324 4445498438 11521982207
WITHIN_1KM_PRISCH 0 179945964150 1716983401 7296404134
Mean squared error (OOB): 792305097.809
R-squared (OOB) %: 94.581
AIC (OOB): 211798.874
AICc (OOB): 211798.921
Mean squared error Predicted (Not OOB): 67123193.54
R-squared Predicted (Not OOB) %: 99.541
AIC Predicted (Not OOB): 186287.785
AICc Predicted (Not OOB): 186287.832
Calculation time (in seconds): 9.6303
test_data_nogeom <- cbind(test_data, coords_test) %>% st_drop_geometry()gwRF_pred <- predict.grf(gwRF_adaptive,
test_data_nogeom,
x.var.name="X",
y.var.name="Y",
local.w=1,
global.w=0)Compare against test data
GRF_pred_df <- as.data.frame(gwRF_pred)
test_data_pred <- cbind(test_data,
GRF_pred_df)Show test prediction over and under predict
rmse(test_data_pred$resale_price,
test_data_pred$gwRF_pred)[1] 28677.78
ggplot(data = test_data_pred,
aes(x = gwRF_pred,
y = resale_price)) +
geom_point()
Show by location
test_data_pred$residuals <- test_data_pred$gwRF_pred - test_data_pred$resale_price
st_crs(test_data_pred)Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
# Load in the mpsz data
mpsz = st_read(dsn = "data/geospatial", layer = "MP14_SUBZONE_WEB_PL") %>% st_transform(3414)Reading layer `MP14_SUBZONE_WEB_PL' from data source
`C:\Users\Admin\Desktop\SMU\ISSS626\ISSS626-KierenChua\InClassEx\InClassEx08\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 323 features and 15 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 2667.538 ymin: 15748.72 xmax: 56396.44 ymax: 50256.33
Projected CRS: SVY21
# plot on tmap
tmap_mode("view")tmap mode set to interactive viewing
tm_shape(mpsz) +
tmap_options(check.and.fix = TRUE) +
tm_polygons(alpha = 0.4) +
tm_shape(test_data_pred) +
tm_dots(col = "residuals",
alpha = 0.6,
style = "quantile")Warning: The shape mpsz is invalid (after reprojection). See sf::st_is_valid
Variable(s) "residuals" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_mode("plot")tmap mode set to plotting